Data Inspection¶

In [1]:
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm

df_list = list()
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv", 
    chunksize=100000, 
    dtype = {"CANCELLATION_CODE": str}
)
for chunk in tqdm(chunk_iter):
    df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:29,  4.45it/s]
Out[2]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... TAXI_IN ARR_TIME ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 28.0 1214.0 9.0 0.0 NaN NaN NaN NaN NaN NaN
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 13.0 945.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... 6.0 1341.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 7.0 1159.0 4.0 0.0 NaN NaN NaN NaN NaN NaN
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 6.0 2317.0 2.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 24 columns

In [3]:
df.describe()
Out[3]:
ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEP_TIME DEP_DELAY TAXI_OUT TAXI_IN ARR_TIME ARR_DELAY CANCELLED CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
count 1.292656e+07 1.292656e+07 1.292656e+07 1.292656e+07 1.271494e+07 1.271486e+07 1.271048e+07 1.270744e+07 1.270744e+07 1.268139e+07 1.292656e+07 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06
mean 1.298824e+06 3.161338e+04 1.298812e+06 3.161370e+04 1.332409e+03 1.127152e+01 1.722694e+01 8.678440e+00 1.471801e+03 5.409690e+00 1.675551e-02 2.041558e+01 2.650336e+00 1.520350e+01 1.268451e-01 2.413592e+01
std 1.453419e+05 1.168922e+03 1.453231e+05 1.168763e+03 5.192380e+02 4.514114e+01 9.141093e+00 6.704624e+00 5.554134e+02 4.738376e+01 1.283540e-01 5.935956e+01 2.208897e+01 3.191662e+01 3.210375e+00 5.163883e+01
min 1.039705e+06 3.019400e+04 1.039705e+06 3.019400e+04 1.000000e+00 -2.340000e+02 0.000000e+00 1.000000e+00 1.000000e+00 -2.380000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 1.129806e+06 3.046600e+04 1.129806e+06 3.046600e+04 9.060000e+02 -5.000000e+00 1.200000e+01 5.000000e+00 1.052000e+03 -1.400000e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 1.289208e+06 3.145400e+04 1.289208e+06 3.145400e+04 1.324000e+03 -1.000000e+00 1.500000e+01 7.000000e+00 1.515000e+03 -5.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00 2.000000e+00 0.000000e+00 0.000000e+00
75% 1.410702e+06 3.245700e+04 1.410702e+06 3.245700e+04 1.751000e+03 9.000000e+00 2.000000e+01 1.000000e+01 1.930000e+03 9.000000e+00 0.000000e+00 2.000000e+01 0.000000e+00 1.900000e+01 0.000000e+00 2.800000e+01
max 1.501606e+06 3.481900e+04 1.501606e+06 3.481900e+04 2.400000e+03 3.695000e+03 2.270000e+02 1.419000e+03 2.400000e+03 3.680000e+03 1.000000e+00 3.359000e+03 2.692000e+03 1.511000e+03 9.870000e+02 3.581000e+03
In [4]:
df.FL_DATE.max()
Out[4]:
'2024-06-30'
In [5]:
df.nunique()
Out[5]:
FL_DATE                  3653
OP_UNIQUE_CARRIER          20
TAIL_NUM                 9140
ORIGIN_AIRPORT_SEQ_ID      58
ORIGIN_CITY_MARKET_ID      20
ORIGIN                     24
ORIGIN_CITY_NAME           24
DEST_AIRPORT_SEQ_ID        58
DEST_CITY_MARKET_ID        20
DEST                       24
DEST_CITY_NAME             24
DEP_TIME                 1440
DEP_DELAY                1783
TAXI_OUT                  195
TAXI_IN                   247
ARR_TIME                 1440
ARR_DELAY                1806
CANCELLED                   2
CANCELLATION_CODE           4
CARRIER_DELAY            1558
WEATHER_DELAY             971
NAS_DELAY                 855
SECURITY_DELAY            268
LATE_AIRCRAFT_DELAY      1228
dtype: int64
In [6]:
df.dtypes
Out[6]:
FL_DATE                   object
OP_UNIQUE_CARRIER         object
TAIL_NUM                  object
ORIGIN_AIRPORT_SEQ_ID      int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN                    object
ORIGIN_CITY_NAME          object
DEST_AIRPORT_SEQ_ID        int64
DEST_CITY_MARKET_ID        int64
DEST                      object
DEST_CITY_NAME            object
DEP_TIME                 float64
DEP_DELAY                float64
TAXI_OUT                 float64
TAXI_IN                  float64
ARR_TIME                 float64
ARR_DELAY                float64
CANCELLED                float64
CANCELLATION_CODE         object
CARRIER_DELAY            float64
WEATHER_DELAY            float64
NAS_DELAY                float64
SECURITY_DELAY           float64
LATE_AIRCRAFT_DELAY      float64
dtype: object

Data Cleaning¶

In [7]:
import matplotlib.pyplot as plt
target_df = df.drop(columns = [
    "OP_UNIQUE_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEST",
    "DEST_CITY_NAME",
    "DEST_AIRPORT_SEQ_ID",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
del df
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
target_df.isnull().mean()
Out[7]:
FL_DATE                0.000000
TAIL_NUM               0.004830
DEP_DELAY              0.016377
ARR_DELAY              0.018966
CANCELLED              0.000000
CANCELLATION_CODE      0.983244
CARRIER_DELAY          0.801989
WEATHER_DELAY          0.801989
NAS_DELAY              0.801989
SECURITY_DELAY         0.801989
LATE_AIRCRAFT_DELAY    0.801989
dtype: float64
In [8]:
plane_set_df = target_df.TAIL_NUM.drop_duplicates()
plane_review = pd.read_csv("../supplementary/N-Number-Registration-Data-2024.csv")
plane_review['N-NUMBER'] = plane_review['N-NUMBER'].apply(lambda x:"N"+x)
plane_review = plane_review[plane_review['N-NUMBER'].isin(plane_set_df)]
plane_review
Out[8]:
N-NUMBER COUNTRY TYPE AIRCRAFT TYPE ENGINE MFR MODEL NO-ENG NO-SEATS AC-WEIGHT MFR_ENGINE MODEL_ENGINE HORSEPOWER THRUST
651 N10156 US Fixed wing multi engine Turbo-fan EMBRAER EMB-145XR 2 55 12,500 - 19,999 ROLLS-ROYC AE3007 SER 0.0 6442.0
813 N101DQ US Fixed wing multi engine Turbo-fan AIRBUS A321-211 2 199 20,000 and over CFM INTL CFM56-5B3/3 0.0 32000.0
816 N101DU US Fixed wing multi engine Turbo-fan C SERIES AIRCRAFT LTD PTNRSP BD-500-1A10 2 133 20,000 and over P & W PW1519G 0.0 19775.0
874 N101HQ US Fixed wing multi engine Turbo-fan EMBRAER-EMPRESA BRASILEIRA DE ERJ 170-200 LR 2 80 20,000 and over GE CF34-8E5 0.0 14510.0
946 N101NN US Fixed wing multi engine Turbo-fan AIRBUS INDUSTRIE A321-231 2 379 20,000 and over IAE V2500SERIES 0.0 25000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
297051 N998AN US Fixed wing multi engine Turbo-fan AIRBUS A321-231 2 379 20,000 and over IAE V2533-A5 0.0 31600.0
297052 N998AT US Fixed wing multi engine Turbo-fan BOEING 717-200 2 100 20,000 and over ROLLS-ROYC TAY 651-54 0.0 15400.0
297102 N998JE US Fixed wing multi engine Turbo-fan AIRBUS A321-231 2 379 20,000 and over IAE V2533-A5 0.0 31600.0
297126 N998NN US Fixed wing multi engine Turbo-fan BOEING 737-800 2 175 20,000 and over CFM INTL CFM56-7B24E 0.0 24200.0
297463 N999JQ US Fixed wing multi engine Turbo-fan AIRBUS A321-231 2 379 20,000 and over IAE V2533-A5 0.0 31600.0

6798 rows × 13 columns

In [9]:
target_airport_df = pd.merge(
    target_df, 
    plane_review,
    how = "inner", 
    left_on = "TAIL_NUM", 
    right_on = "N-NUMBER",
    suffixes = ["", "_origin"]
)

target_airport_df.drop(columns = ['N-NUMBER'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[9]:
FL_DATE TAIL_NUM DEP_DELAY ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY ... TYPE ENGINE MFR MODEL NO-ENG NO-SEATS AC-WEIGHT MFR_ENGINE MODEL_ENGINE HORSEPOWER THRUST
0 2014-07-01 N200AA 3.0 -13.0 0.0 Not NaN NaN NaN NaN ... Turbo-prop TEXTRON AVIATION INC 208B 1 12 Up to 12,499 P&W CANADA PT6A-140 867.0 0.0
1 2014-07-01 N200AA 5.0 1.0 0.0 Not NaN NaN NaN NaN ... Turbo-prop TEXTRON AVIATION INC 208B 1 12 Up to 12,499 P&W CANADA PT6A-140 867.0 0.0
2 2014-07-01 N201AA -5.0 -3.0 0.0 Not NaN NaN NaN NaN ... Reciprocating CESSNA 150 1 2 Up to 12,499 CONT MOTOR 0-200 SERIES 100.0 0.0
3 2014-07-01 N201AA -5.0 0.0 0.0 Not NaN NaN NaN NaN ... Reciprocating CESSNA 150 1 2 Up to 12,499 CONT MOTOR 0-200 SERIES 100.0 0.0
4 2014-07-01 N201AA -1.0 -2.0 0.0 Not NaN NaN NaN NaN ... Reciprocating CESSNA 150 1 2 Up to 12,499 CONT MOTOR 0-200 SERIES 100.0 0.0

5 rows × 23 columns

Visualization¶

Review Performance per Airport¶

In [10]:
train_columns = [
    'NO-ENG',
    'NO-SEATS',
    'HORSEPOWER',
    'THRUST'
]
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]
In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

cat_columns = ['COUNTRY', 'TYPE AIRCRAFT', 'TYPE ENGINE', 'AC-WEIGHT']
num_columns = ['NO-ENG','NO-SEATS','HORSEPOWER','THRUST'] + label_columns

# Create dummy variables for categorical columns
dummy_df = pd.get_dummies(target_airport_df[cat_columns])
# Combine dummy variables with numerical columns
new_df = pd.concat([target_airport_df[num_columns], dummy_df], axis=1)
# Assuming the DataFrame is called 'df'
corr_matrix = new_df.corr(method = "spearman")
corr_matrix = corr_matrix.loc[corr_matrix.columns.difference(label_columns), label_columns]

# Create a heatmap using Seaborn
sns.set(style="white")
plt.figure(figsize=(16,10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image